1 Setup

## load relevant libraries and functions
require(knitr)         # for knitting
library(Hmisc)         # for descriptives
library(png)           # for working with images
library(grid)
library(DT)            # for data tables
library(tidyverse)     # for everything else
library(ggeffects)     # for regression outputs
library(broom.mixed)
library(emmeans)
library(generics)
library(broom)
library(modelr)
library(lme4)          # for mixed effects models
library(ltm)           # for IRT stuff
library(TAM)
library(psych)
library(ggplotify)     # for plotting
library(patchwork)

## set default code chunk options
## (FIX: use TRUE/FALSE rather than T/F -- T and F are ordinary,
## reassignable variables in R, not reserved words)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

## set default plot theme and colors
theme_set(theme_classic(base_size = 18))

## fix print width for knitted doc
options(width = 70)

## suppress warnings about grouping 
options(dplyr.summarise.inform = FALSE)
options(xtable.floating = FALSE)
options(xtable.timestamp = "")

## set random seed for reproducibility
set.seed(1)

## set directories for plots and data outputs
figures_dir = "../figures/"
data_dir = "../data/"

2 Phase 1: Exploring and visualizing the data

2.1 Load and examine the data

# Import data as downloaded from https://data-visualization-benchmark.s3.us-west-2.amazonaws.com/vt-fusion/psych_139_mini_project_1_split_80_responses.csv

csv_file = "psych_139_mini_project_1_split_80_responses.csv"
df.raw = read.csv(paste0(data_dir, csv_file))

# run quick data summary
skimr::skim(df.raw)
Table 2.1: Data summary
Name df.raw
Number of rows 15616
Number of columns 15
_______________________
Column type frequency:
character 12
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
participant_id 0 1 24 24 0 426 0
question_text 0 1 30 375 0 150 0
image_url 0 1 83 92 0 163 0
test_name 0 1 3 5 0 5 0
graph_type 0 1 3 16 0 72 0
task_category 0 1 3 25 0 26 0
possible_responses 0 1 13 327 0 124 0
table_presentation_format 0 1 4 5 0 2 0
participant_response 0 1 0 112 356 403 0
task_type_merged 0 1 20 22 0 3 0
correct_answer 0 1 1 112 0 122 0
misleading_item 0 1 4 5 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
X 0 1 9795.11 5651.72 82 4830.75 9649.5 14721.25 19526 ▇▇▇▇▇
item_id 0 1 114.91 66.59 1 57.00 114.0 174.00 229 ▇▇▇▇▇
correct_response 0 1 0.63 0.48 0 0.00 1.0 1.00 1 ▅▁▁▁▇

2.2 Data Wrangling

# reshape to wide format so each row is one participant (N = 426)

df.wide =
  df.raw %>%
  # one column per question, holding that participant's value from the
  # "correct_response" column (values_fn = mean collapses duplicates)
  pivot_wider(id_cols = participant_id,
              names_from = question_text,
              values_from = correct_response,
              values_fn = mean)

# print top rows
head(df.wide, n = 10)
## # A tibble: 10 × 151
##    participant_id        Which month has 25 m…¹ How many months have…²
##    <chr>                                  <dbl>                  <dbl>
##  1 6631326877f343bf84e1…                      1                      1
##  2 66abd9806349818b3dab…                      1                      1
##  3 610898813e48dc286496…                      0                     NA
##  4 62e2deadd18974300540…                      1                     NA
##  5 63c6d6d5e132610bf009…                      1                      1
##  6 64136da866ff27f7bf98…                      0                      1
##  7 6737d3a1eea5bb6f29a9…                      1                      1
##  8 6159fb27b6340dda3fc6…                      0                      1
##  9 6760dca655eab7da158e…                      1                      1
## 10 677e75b259b7f06d0c32…                      1                      1
## # ℹ abbreviated names: ¹​`Which month has 25 mm of rain?`,
## #   ²​`How many months have less than 40mm of rain?`
## # ℹ 148 more variables: `Which season has the most rain?` <dbl>,
## #   `In which season does each month have less rain than the month before?` <dbl>,
## #   `Which season has more rain, the summer or the spring?` <dbl>,
## #   `In San Francisco, as the weather gets warmer, there is generally more rain.` <dbl>,
## #   `How much does it rain in March?` <dbl>, …

2.3 Preliminary Item Difficulty Analysis: % correct

Across all tests, we have a total of 184 items (150 questions), with about 80-90 responses per item.

# Item-level summary: for each question, the share of participants who
# responded correctly (mean), its SD, the response count, and the SE

df.wide_summary =
  df.raw %>%
  group_by(question_text, test_name, graph_type, task_type_merged) %>%
  dplyr::summarise(
    # mean % correct
    mean_correct = mean(correct_response, na.rm = TRUE),
    # SD of correctness
    sd_correct = sd(correct_response, na.rm = TRUE),
    # number of responses
    n = n(),
    # SE = SD / sqrt(n)
    se_correct = sd_correct / sqrt(n)) %>%
  ungroup()

# print top rows
head(df.wide_summary, n = 10)
## # A tibble: 10 × 8
##    question_text    test_name graph_type task_type_merged mean_correct
##    <chr>            <chr>     <chr>      <chr>                   <dbl>
##  1 A group of male… VLAT      Scatter    statistical inf…        0.782
##  2 A group of the … VLAT      Scatter    statistical inf…        0.616
##  3 About how much … VLAT      Line       arithmetic comp…        0.784
##  4 About what is t… VLAT      Pie        value identific…        0.619
##  5 About what is t… VLAT      Stacked B… value identific…        0.494
##  6 About what was … VLAT      Stacked A… value identific…        0.273
##  7 According to yo… GGR       Line       statistical inf…        0.494
##  8 Approximately w… GGR       Line       value identific…        0.831
##  9 Approximately w… GGR       Pie        arithmetic comp…        0.671
## 10 Approximately, … CALVI     Stacked A… arithmetic comp…        0.176
## # ℹ 3 more variables: sd_correct <dbl>, n <int>, se_correct <dbl>

Below, we evaluate means and SDs per test:

# grand mean of item-level % correct across all tests
mean(df.wide_summary$mean_correct)
## [1] 0.6288262
# grand SD of item-level % correct across all tests
sd(df.wide_summary$mean_correct)
## [1] 0.2738472
# per-test mean, SD, min, and max of item-level % correct
df.wide_summary %>%
  group_by(test_name) %>%
  summarise(mean = mean(mean_correct),
            sd = sd(mean_correct),
            min = min(mean_correct),
            max = max(mean_correct))
## # A tibble: 5 × 5
##   test_name  mean    sd    min   max
##   <chr>     <dbl> <dbl>  <dbl> <dbl>
## 1 BRBF      0.699 0.214 0.2    0.988
## 2 CALVI     0.435 0.285 0.0120 0.964
## 3 GGR       0.620 0.291 0.129  0.943
## 4 VLAT      0.657 0.259 0.0723 0.988
## 5 WAN       0.795 0.192 0.227  0.989

2.3.1 Viz 1: % correct for single items

The plot below shows that as expected, even within a single test, the items varied considerably in their difficulty (as measured by % correct).

df.wide_summary %>%
  # question on x-axis, ordered by % correct
  ggplot(aes(x = reorder(question_text, mean_correct),
             # % correct on y-axis
             y = mean_correct*100)) +
  # add CIs based on 1.96*SE
  # (FIX: `size` is deprecated for line widths in ggplot2 >= 3.4;
  # use `linewidth` for consistency with the rest of this document)
  geom_errorbar(
    aes(ymin = (mean_correct - (1.96*se_correct))*100,
        ymax = (mean_correct + (1.96*se_correct))*100),
    width = 2,
    color = "black",
    alpha = .7,
    linewidth = 1) +
  # plot one point per question/item, filled by graph type
  # (FIX: the original passed `color = graph_type` to ggplot() outside
  # aes(), where it was silently ignored; the fill mapping below is what
  # actually colors the points)
  geom_point(aes(fill = graph_type),
             shape = 21, color = "black", size = 2) +
  # facet by task type and test
  facet_wrap(~ task_type_merged + test_name,
             ncol = 5, scales = "free_y") +
  # adjust x-axis label orientation for readability
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1,
                                   size = 1),
        legend.position = "none") +
  # fix scale to be 0-100, breaks of 25
  scale_y_continuous(limits = c(0, 100),
                     breaks = seq(0, 100, 25)) +
  # add title and axis labels
  labs(title = "Individual Item Difficulty by Task Type & Test",
       x = "Question",
       y = "% correct")

# export plot (ggsave defaults to the last plot rendered)
ggsave(paste0(figures_dir,
              "pct_correct_tasktype_test.png"),
       width = 20, height = 10, device = "png")

2.3.2 Viz 2: % correct averages by task type

The plot below shows that even though different tests contained items of the same “task type”, their respective item difficulty still varied across tests.

# create new sub-dataframe: within a given test, collapse across a given
# task type and compute % of participants who responded correctly
# (mean), SD, and SE
df.task_summary_tasktype =
  df.raw %>%
  # group by test and task type
  group_by(test_name, task_type_merged) %>%
  # compute mean
  summarise(mean_correct = mean(correct_response, na.rm = TRUE),
            # compute SD
            sd_correct = sd(correct_response, na.rm = TRUE),
            # compute number of responses
            n = n(),
            # compute SE
            se_correct = sd_correct / sqrt(n))

df.task_summary_tasktype %>%
  # test on x-axis
  ggplot(aes(x = test_name,
             # % correct on y-axis
             y = (mean_correct*100),
             # color by test name
             fill = test_name,
             color = test_name)) +
  # add CIs based on 1.96*SE
  # (BUG FIX: the original lower bound used 1.96*(1.96*se_correct),
  # i.e. ~3.84*SE, which made the error bars asymmetric; both bounds
  # now use 1.96*SE)
  geom_errorbar(aes(ymin = (mean_correct - (1.96*se_correct))*100,
                    ymax = (mean_correct + (1.96*se_correct))*100),
                position = position_dodge(width = 0.9),
                width = 0.1,
                color = "black") +
  # Add one point for each % correct collapsed across task type
  # (FIX: removed invalid `width` argument -- geom_point has no width
  # parameter; dodging is controlled via position_dodge)
  geom_point(position = position_dodge(width = 0.9),
             size = 3) +
  # Adjust scales to be 0-100
  scale_y_continuous(limits = c(0, 100),
                     breaks = seq(0, 100, 25),
                     labels = scales::percent_format(scale = 1)) +
  # Add title and axis labels
  labs(title = "% correct averaged across Task Type",
       x = "Task Type",
       y = "% correct",
       fill = "Test") +
  # Adjust theme
  theme_minimal(base_size = 18) +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "bottom") +
  guides(fill = "none") +
  # Facet by task type
  facet_grid(~task_type_merged)

# export plot
ggsave(paste0(figures_dir,
              "pct_correct_tasktype.png"),
       width = 10, height = 6, device = "png")

2.3.3 Descriptives: Single-item descriptives for % correct

In the output below, we see that on some items, as few as 1% of participants selected the correct response (very difficult items), and on other items, as many as 99% of participants selected the correct response (very easy items).

Across items, SDs of % correct ranged from 10.6% to 50%.

## Across dataset: five-number summaries (plus mean) of the item-level
## mean and SD columns
summary(df.wide_summary[, c("mean_correct", "sd_correct")])
##   mean_correct       sd_correct    
##  Min.   :0.01205   Min.   :0.1066  
##  1st Qu.:0.40964   1st Qu.:0.3258  
##  Median :0.70177   Median :0.4142  
##  Mean   :0.62883   Mean   :0.3870  
##  3rd Qu.:0.85882   3rd Qu.:0.4802  
##  Max.   :0.98864   Max.   :0.5029
# Print 10 questions with lowest average % correct
# (arranged descending, so the tail holds the 10 smallest values)
df.wide_summary %>%
  dplyr::select(question_text, mean_correct, sd_correct) %>%
  arrange(desc(mean_correct)) %>%
  slice_tail(n = 10) %>%
  print()
## # A tibble: 10 × 3
##    question_text                               mean_correct sd_correct
##    <chr>                                              <dbl>      <dbl>
##  1 What was the number of girls named 'Amelia…       0.145       0.354
##  2 Assuming today is May 10 and given the cha…       0.141       0.350
##  3 Which of the treatments contributes to a l…       0.129       0.338
##  4 What is the range in weight for the 85 mal…       0.0833      0.278
##  5 Do U.S. states have a more similar number …       0.0814      0.275
##  6 What was the unemployment rate for Indiana…       0.0723      0.261
##  7 Are there more households with exactly 1 c…       0.0575      0.234
##  8 Assuming today is May 01, 2022, which of t…       0.0488      0.217
##  9 Does any members of species C weigh more t…       0.0233      0.152
## 10 Does cell phone brand A have more than hal…       0.0120      0.110
# Print 10 questions with highest average % correct
df.wide_summary %>%
  dplyr::select(question_text, mean_correct, sd_correct) %>%
  arrange(desc(mean_correct)) %>%
  slice_head(n = 10) %>%
  print()
## # A tibble: 10 × 3
##    question_text                               mean_correct sd_correct
##    <chr>                                              <dbl>      <dbl>
##  1 Which month has 25 mm of rain?                     0.989      0.107
##  2 In which company is the global smartphone …        0.988      0.108
##  3 Do all of these planets have the same life…        0.988      0.109
##  4 In which country is the average internet s…        0.976      0.153
##  5 Which planet has the highest life expectan…        0.976      0.153
##  6 Over the course of years between 2009 and …        0.976      0.154
##  7 Do all of these planets have the same life…        0.966      0.184
##  8 What is the trend of the monthly number of…        0.964      0.187
##  9 Which season has the most rain?                    0.964      0.187
## 10 Do all of these planets have the same life…        0.964      0.188
# Print 10 questions with least variability
# (arranged by descending SD, so the tail holds the 10 smallest SDs)
df.wide_summary %>%
  dplyr::select(question_text, mean_correct, sd_correct) %>%
  arrange(desc(sd_correct)) %>%
  slice_tail(n = 10) %>%
  print()
## # A tibble: 10 × 3
##    question_text                               mean_correct sd_correct
##    <chr>                                              <dbl>      <dbl>
##  1 What is the trend of the monthly number of…       0.964       0.187
##  2 Do all of these planets have the same life…       0.966       0.184
##  3 Over the course of years between 2009 and …       0.976       0.154
##  4 In which country is the average internet s…       0.976       0.153
##  5 Which planet has the highest life expectan…       0.976       0.153
##  6 Does any members of species C weigh more t…       0.0233      0.152
##  7 Does cell phone brand A have more than hal…       0.0120      0.110
##  8 Do all of these planets have the same life…       0.988       0.109
##  9 In which company is the global smartphone …       0.988       0.108
## 10 Which month has 25 mm of rain?                    0.989       0.107
# Print 10 questions with most variability (largest SDs)
df.wide_summary %>%
  dplyr::select(question_text, mean_correct, sd_correct) %>%
  arrange(desc(sd_correct)) %>%
  slice_head(n = 10) %>%
  print()
## # A tibble: 10 × 3
##    question_text                               mean_correct sd_correct
##    <chr>                                              <dbl>      <dbl>
##  1 About what is the ratio of the cost of a s…        0.494      0.503
##  2 According to your best guess, what will th…        0.494      0.503
##  3 The approval rating of Republicans for the…        0.494      0.503
##  4 The residents of town Y are voting on a fa…        0.512      0.503
##  5 What is the height for a person who lies o…        0.512      0.503
##  6 What is the trend of the average price of …        0.517      0.503
##  7 How many planets spend approximately the s…        0.476      0.502
##  8 How many planets spend approximately the s…        0.476      0.502
##  9 In which city is the cost of soda the high…        0.535      0.502
## 10 What has the general average unemployment …        0.534      0.502

3 Phase 2: Defining & evaluating statistical models

In the dataset we examined above, we have data from a series of tests that (supposedly) tap data viz literacy. Notably, not all items in and across these tests are the same. In any test, you want a variety of items. Based on this variety, it’s likely some items will be harder (and some will be easier) than others. I begin by performing some sanity checks that assess whether the items we would expect to be harder were indeed harder for participants (as per lower % correct).

3.1 RQ1: Are items that were designed to mislead harder than those that were not?

That is, does misleading nature (categorical; yes vs. no) predict % correct?

Note: Only CALVI included misleading items. Thus, we examine data only from this test (4,082 responses from N = 425 participants).

Prediction: Misleading items are lower % correct (i.e., higher difficulty) than non-misleading items.

Method: Mixed effects logistic regression, with a fixed effect for item nature and a random intercept per participant.

# Extract relevant subset of data: CALVI responses only
df.misleading =
  df.raw %>%
  # Keep only CALVI rows (the only test with misleading items)
  filter(test_name == "CALVI") %>%
  # Keep only relevant cols
  dplyr::select(participant_id, question_text, test_name, graph_type,
                task_type_merged, correct_response, misleading_item) %>%
  # Convert "misleading nature" into a factor (True vs False)
  mutate(misleading_item = factor(misleading_item))

Out of the 48 items we have data on in CALVI, 36 are misleading (75%), and 12 are not (25%).

# tally misleading vs. non-misleading items (one row per unique question)
count(distinct(df.misleading, question_text, misleading_item),
      misleading_item)
##   misleading_item  n
## 1           False 12
## 2            True 36

3.1.1 Descriptives: Mean and SD for misleading vs. non-misleading items

# mean, SD, and n of correctness for misleading vs. non-misleading items
df.misleading %>%
  group_by(misleading_item) %>%
  summarise(mean = mean(correct_response, na.rm = TRUE),
            sd = sd(correct_response, na.rm = TRUE),
            n = n())
## # A tibble: 2 × 4
##   misleading_item  mean    sd     n
##   <fct>           <dbl> <dbl> <int>
## 1 False           0.700 0.459  1022
## 2 True            0.346 0.476  3060

3.1.2 Viz 3: % correct for misleading vs non-misleading items

In line with the descriptive stats above, the plot shows that at first glance, the average % correct for misleading items seems to be lower than the average % correct for non-misleading items.

# Plot the raw data: % correct by item nature (misleading vs. not)
p.misleading =
  df.misleading %>%
  ggplot(aes(x = misleading_item,
             y = (correct_response*100),
             fill = misleading_item)) +
  # raw data points, jittered horizontally only
  geom_point(position = position_jitter(width = .3, height = 0),
             shape = 21,
             size = 2,
             alpha = .3) +
  # bootstrapped CIs for error bars
  stat_summary(fun.data = "mean_cl_boot",
               geom = "pointrange",
               color = "black",
               shape = 21,
               size = 1,
               linewidth = 3) +
  # adjust y scale: 0-100 in steps of 25, labeled as percent
  scale_y_continuous(limits = c(0, 100),
                     breaks = seq(0, 100, 25),
                     labels = scales::percent_format(scale = 1)) +
  # color scale for fill
  scale_fill_brewer(palette = "Pastel2") +
  # title and axis labels
  labs(title = "% correct by misleading vs. non-misleading items",
       x = "Misleading Item",
       y = "% correct",
       fill = "Misleading?") +
  # theme
  theme_classic(base_size = 36) +
  theme(legend.position = "none")

print(p.misleading)

# export plot
ggsave(paste0(figures_dir, "pct_correct_misleading.png"),
       plot = p.misleading,
       width = 16, height = 8, device = "png")

3.1.3 Model A: Mixed Effects Logistic Regression: % correct ~ misleading

The logistic regression analysis shows that compared to a minimal model that fits the mean regardless of item nature (misleading vs. not misleading), the model including item nature improved fit to data:

\(χ^2\) = 356.6, df = 1, β = -1.51, SE = .080, p < .001; RMSE_model = .463 vs RMSE_baseline = 0.496.

Predicted % correct for non-misleading items was about twice as large (70%; 95% CIs = [.67, .73]) as % correct for misleading items (34%; 95% CIs = [.32, .36]).

# convert participant_id to a factor so it can serve as the grouping
# factor in the mixed model below
df.misleading =
  df.misleading %>%
  mutate(participant_id = factor(participant_id))

# Run empty model fitting the mean
# NOTE(review): no `family` is supplied, so glm() defaults to gaussian;
# this baseline is thus a linear model, while the augmented model below
# is a binomial glmer. The anova() comparison further down is therefore
# not a valid likelihood-ratio test -- consider an intercept-only
# binomial glmer as a like-for-like baseline.
model.misleading_empty =
  glm(correct_response ~
          # intercept
          1,
      data = df.misleading)

# print model summary
model.misleading_empty %>%
  summary()
## 
## Call:
## glm(formula = correct_response ~ 1, data = df.misleading)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.434346   0.007759   55.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2457497)
## 
##     Null deviance: 1002.9  on 4081  degrees of freedom
## Residual deviance: 1002.9  on 4081  degrees of freedom
## AIC: 5858.4
## 
## Number of Fisher Scoring iterations: 2
# RMSE = sqrt of the mean squared prediction error
pred_empty = predict(model.misleading_empty, type = "response")
rmse_empty =
  sqrt(mean((df.misleading$correct_response - pred_empty)^2))

print(rmse_empty)
## [1] 0.4956708
########################

# Run augmented model adding predictor of interest (misleading nature):
# fixed effect for item nature plus a random intercept per participant
model.misleading_augmented =
  glmer(correct_response ~ 1 + misleading_item + (1 | participant_id),
        data = df.misleading,
        family = "binomial")

# Print regression output
summary(model.misleading_augmented)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## correct_response ~ 1 + misleading_item + (1 | participant_id)
##    Data: df.misleading
## 
##      AIC      BIC   logLik deviance df.resid 
##   5195.3   5214.2  -2594.6   5189.3     4079 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.7064 -0.7271 -0.6680  1.2000  1.5747 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  participant_id (Intercept) 0.08683  0.2947  
## Number of obs: 4082, groups:  participant_id, 425
## 
## Fixed effects:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.86159    0.07082   12.17   <2e-16 ***
## misleading_itemTrue -1.51333    0.08013  -18.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## msldng_tmTr -0.856
# joint (Wald chi-square) test of the fixed effect
joint_tests(model.misleading_augmented)
##  model term      df1 df2 F.ratio  Chisq p.value
##  misleading_item   1 Inf 356.640 356.64  <.0001
# Show predicted probabilities from the augmented model
ggpredict(model.misleading_augmented)
## $misleading_item
## # Predicted probabilities of correct_response
## 
## misleading_item | Predicted |     95% CI
## ----------------------------------------
## False           |      0.70 | 0.67, 0.73
## True            |      0.34 | 0.32, 0.36
## 
## Adjusted for:
## * participant_id = 0 (population-level)
## 
## attr(,"class")
## [1] "ggalleffects" "list"        
## attr(,"model.name")
## [1] "."
# Calculate model fit statistics (AIC, BIC, logLik, deviance)
broom.mixed::glance(model.misleading_augmented)
## # A tibble: 1 × 7
##    nobs sigma logLik   AIC   BIC deviance df.residual
##   <int> <dbl>  <dbl> <dbl> <dbl>    <dbl>       <int>
## 1  4082     1 -2595. 5195. 5214.    5052.        4079
# Compute RMSE for the augmented model
pred_augmented = predict(model.misleading_augmented, type = "response")
rmse_misleading =
  sqrt(mean((df.misleading$correct_response - pred_augmented)^2))

print(rmse_misleading)
## [1] 0.4630246
# Compare empty model with augmented model
# NOTE(review): this compares a gaussian glm() against a binomial
# glmer(); anova() cannot form a valid likelihood-ratio test across
# these model classes/families (the output table below is degenerate --
# only the NULL row is shown). Refit the baseline as an intercept-only
# binomial glmer and compare the two glmer fits for a proper test.
anova(model.misleading_empty, model.misleading_augmented)
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: correct_response
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev
## NULL                  4081     1002.9

3.1.4 Model Visualizations

# the empty model's single prediction: the grand mean, on a 0-100 scale
grand_mean_pct = mean(df.misleading$correct_response) * 100

# plot the data with a line for the empty model prediction (mean)
p.empty =
  df.misleading %>%
  ggplot(aes(x = misleading_item,
             y = (correct_response*100),
             fill = misleading_item)) +
  # raw data points, jittered horizontally only
  geom_point(position = position_jitter(width = .3, height = 0),
             shape = 21,
             size = 2,
             alpha = .3) +
  # bootstrapped CIs
  stat_summary(fun.data = "mean_cl_boot",
               geom = "pointrange",
               color = "black",
               shape = 21,
               size = 1,
               linewidth = 3) +
  # horizontal line at the grand mean
  geom_hline(yintercept = grand_mean_pct,
             color = "lightblue", size = 3) +
  # y scale: 0-100 in steps of 25, labeled as percent
  scale_y_continuous(limits = c(0, 100),
                     breaks = seq(0, 100, 25),
                     labels = scales::percent_format(scale = 1)) +
  scale_fill_brewer(palette = "Pastel2") +
  labs(title = "% correct by misleading vs. non-misleading items",
       x = "Misleading Item",
       y = "% correct",
       fill = "Misleading?") +
  theme_classic(base_size = 36) +
  theme(legend.position = "none")

print(p.empty)

# group means plotted as the augmented model's fitted lines
# (NOTE: these are the raw per-group means; with random intercepts they
# only approximate the model's population-level predictions)
mean_true = mean(
  df.misleading$correct_response[df.misleading$misleading_item == "True"])
mean_false = mean(
  df.misleading$correct_response[df.misleading$misleading_item == "False"])

# plot the data with one line per group mean
p.augmented =
  df.misleading %>%
  ggplot(aes(x = misleading_item,
             y = (correct_response*100),
             fill = misleading_item)) +
  # raw data points, jittered horizontally only
  geom_point(position = position_jitter(width = .3, height = 0),
             shape = 21,
             size = 2,
             alpha = .3) +
  # bootstrapped CIs
  stat_summary(fun.data = "mean_cl_boot",
               geom = "pointrange",
               color = "black",
               shape = 21,
               size = 1,
               linewidth = 3) +
  # separate fitted lines for each group mean
  geom_hline(yintercept = mean_true*100,
             color = "orange", size = 3) +
  geom_hline(yintercept = mean_false*100,
             color = "lightgreen", size = 3) +
  # y scale: 0-100 in steps of 25, labeled as percent
  scale_y_continuous(limits = c(0, 100),
                     breaks = seq(0, 100, 25),
                     labels = scales::percent_format(scale = 1)) +
  scale_fill_brewer(palette = "Pastel2") +
  labs(title = "% correct by misleading vs. non-misleading items",
       x = "Misleading Item",
       y = "% correct",
       fill = "Misleading?") +
  theme_classic(base_size = 36) +
  theme(legend.position = "none")

print(p.augmented)

3.2 RQ2: Are longer questions harder than shorter questions?

That is, does question length (continuous) predict % correct?

Note: I examined data from all tests (15,616 responses from N = 426 participants) and defined question length as the number of characters in the question.

Prediction: Longer questions are lower % correct (i.e., higher difficulty) than short questions.

Method: Mixed effects linear regression, with a fixed effect for question length and a random intercept per participant.

# Extract relevant subset of data and derive question length
df.length =
  df.raw %>%
  # Keep only relevant cols
  dplyr::select(participant_id, test_name, graph_type, task_type_merged,
                correct_response, question_text) %>%
  # question length = number of characters in the question text
  mutate(question_length = nchar(question_text))

# item-level summary: % correct, n, and question length per question
df.length_summary =
  df.length %>%
  dplyr::select(question_text, correct_response) %>%
  group_by(question_text) %>%
  summarise(percent_correct = mean(correct_response, na.rm = TRUE) * 100,
            n = n(),
            question_length = nchar(first(question_text))) %>%
  ungroup()

3.2.1 Descriptives: Mean and SD of question length

There is some variation in question length both within and across tests, with the average question length ranging from 48 to 95 characters.

# per-test mean and SD of question length (characters)
df.length %>%
  group_by(test_name) %>%
  summarise(mean = mean(question_length),
            sd = sd(question_length))
## # A tibble: 5 × 3
##   test_name  mean    sd
##   <chr>     <dbl> <dbl>
## 1 BRBF       85.6  34.9
## 2 CALVI      94.7  46.3
## 3 GGR        92.5  30.2
## 4 VLAT       77.5  26.1
## 5 WAN        47.7  16.3

3.2.2 Viz 4: % correct by question length

The plot shows that at first glance, most questions are clustered around question length 50-150. Even within that range, % correct is highly variable, and there is not much of a visually clear trend.

# Plot item-level % correct against question length
df.length_summary %>%
  ggplot(aes(x = question_length,
             y = percent_correct)) +
  # raw data, lightly jittered horizontally
  geom_point(position = position_jitter(width = .5, height = 0),
             size = 2,
             alpha = .3) +
  # y scale: 0-100 in steps of 25, labeled as percent
  scale_y_continuous(limits = c(0, 100),
                     breaks = seq(0, 100, 25),
                     labels = scales::percent_format(scale = 1)) +
  xlim(0, 375) +
  # axis labels
  labs(y = "% correct",
       x = "Question Length")

3.2.3 Model B: Mixed Effects Linear Regression Analysis: % correct ~ question length

The regression analysis shows that compared to a minimal model that fits the mean regardless of question length, the model including question length improved fit to data:

\(χ^2\) = 368.8, df = 1, β standardized = -.37, SE = .019, p < .001; RMSE_model = .459 vs RMSE_baseline = 0.483.

On average, for every unit increase in question length (1 unit = 1 SD; standardized), the predicted likelihood of a correct response decreased (β = −.37 on the log-odds scale).

# standardize question length (z-score) and factor participant_id
# NOTE(review): scale() returns an n x 1 matrix, so question_length
# becomes a matrix column after this mutate; glmer() accepts it, but
# wrap in as.numeric() if downstream code expects a plain vector.
df.length =
  df.length %>%
  mutate(participant_id = factor(participant_id),
         question_length = scale(question_length,
                                 center = T, scale = T))

# Run empty model fitting the mean
# NOTE(review): family defaults to gaussian here while the augmented
# model below is a binomial glmer -- see the matching note in RQ1; the
# anova() comparison at the end of this section is not a valid
# likelihood-ratio test.
model.length_empty =
  glm(correct_response ~
        # intercept
        1,
      data = df.length)

# print model summary
model.length_empty %>%
  summary()
## 
## Call:
## glm(formula = correct_response ~ 1, data = df.length)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.628458   0.003867   162.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2335135)
## 
##     Null deviance: 3646.3  on 15615  degrees of freedom
## Residual deviance: 3646.3  on 15615  degrees of freedom
## AIC: 21606
## 
## Number of Fisher Scoring iterations: 2
# RMSE for the empty model
pred_length_empty = predict(model.length_empty, type = "response")
rmse_empty =
  sqrt(mean((df.length$correct_response - pred_length_empty)^2))

print(rmse_empty)
## [1] 0.4832169
########################

# Run augmented model adding predictor of interest (question length)
model.length_augmented =
  glmer(correct_response ~
          # intercept
          1 +
          # fixed effect for question length (standardized)
          question_length +
          # random intercept for participant
          # (NOTE(review): the original comment said "question", but the
          # grouping factor here is participant_id)
          (1 | participant_id),
        family = "binomial",
        data = df.length)

# Print regression output
model.length_augmented %>%
  summary()
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## correct_response ~ 1 + question_length + (1 | participant_id)
##    Data: df.length
## 
##      AIC      BIC   logLik deviance df.resid 
##  19790.9  19813.9  -9892.5  19784.9    15613 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.7999 -1.0772  0.5761  0.7388  5.0870 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  participant_id (Intercept) 0.2733   0.5227  
## Number of obs: 15616, groups:  participant_id, 426
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.56233    0.03077   18.28   <2e-16 ***
## question_length -0.37418    0.01948  -19.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## qustn_lngth -0.020
# Joint F/Chisq test for the question_length fixed effect
# (pbkrtest.limit raised to cover all 15616 observations)
model.length_augmented %>%
  joint_tests(pbkrtest.limit = 15616)
##  model term      df1 df2 F.ratio   Chisq p.value
##  question_length   1 Inf 368.825 368.825  <.0001
# Calculate model statistics
# (one-row tibble with AIC/BIC/logLik/deviance)
model.length_augmented %>% 
  broom.mixed::glance()
## # A tibble: 1 × 7
##    nobs sigma logLik    AIC    BIC deviance df.residual
##   <int> <dbl>  <dbl>  <dbl>  <dbl>    <dbl>       <int>
## 1 15616     1 -9892. 19791. 19814.   19010.       15613
# Compute RMSE
# NOTE(review): predict() on a glmer uses the fitted participant random
# effects by default (re.form = NULL), so this RMSE is conditional on
# participants, unlike the population-level empty-model RMSE above.
rmse_questionlength =
  # square root
  sqrt(
    # take mean
    mean((df.length$correct_response -
            predict(model.length_augmented, type = "response"))^2
         # errors squared
         )
    )

print(rmse_questionlength)
## [1] 0.4586007
# Compare empty model with augmented model
# NOTE(review): this mixes a gaussian glm with a binomial glmer; they are
# neither the same model class nor the same family, so anova() cannot test
# them against each other (the printed table below contains no test row for
# the augmented model). Refit the empty model as a binomial glmer (intercept
# + random intercept) for a valid likelihood-ratio comparison.
anova(model.length_empty, model.length_augmented)
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: correct_response
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev
## NULL                 15615     3646.3

3.2.4 Model Visualizations

# Visualize the empty (intercept-only) model: raw data plus the grand mean
mean_correct = mean(df.length$correct_response)

df.length %>%
  ggplot(aes(x = question_length,
             y = correct_response)) +
  # trial-level responses, jittered horizontally to reduce overplotting
  geom_point(position = position_jitter(width = 0.5,
                                        height = 0),
             size = 2,
             alpha = 0.5) +
  # horizontal line at the grand mean (the empty model's prediction)
  geom_hline(yintercept = mean_correct,
             color = "lightblue", size = 1) +
  labs(y = "Correct (y = 1, n = 0)",
       x = "Question Length")

# Visualize the augmented model: raw data plus a fitted linear trend
df.length %>%
  ggplot(aes(x = question_length,
             y = correct_response)) +
  geom_point(alpha = 0.6, size = 2) +
  # linear trend with confidence band
  geom_smooth(method = "lm", se = TRUE, color = "lightblue") +
  labs(title = "% correct by question length",
       x = "Question Length",
       y = "Correct (y = 1, n = 0)")

3.3 Comparing Model A and Model B

We have to reduce the data set used for model A to just CALVI items in order to match the trial-level data that is fed into the model.

# Build the trial-level CALVI data set that both comparison models are fit on
df.comparison_model =
  df.raw %>%
  # restrict to CALVI trials
  filter(test_name == "CALVI") %>%
  # keep only the columns the models need
  dplyr::select(participant_id, question_text, test_name, graph_type,
                task_type_merged, correct_response, misleading_item) %>%
  # recode predictors: misleading flag and ids as factors; question length
  # as standardized character count
  mutate(misleading_item = factor(misleading_item),
         question_length = scale(nchar(question_text),
                                 center = TRUE, scale = TRUE),
         participant_id = factor(participant_id))

# Fit models anew

# Model A
model.comparison.misleading =
  glmer(correct_response ~
          # intercept
          1 +
          # fixed effect for misleading (vs. non-misleading) item
          misleading_item +
          # random intercept for participant
          (1 | participant_id),
        family = "binomial",
        data = df.comparison_model)

# Model B
model.comparison.length =
  glmer(correct_response ~
          # intercept
          1 +
          # fixed effect for question length
          question_length +
          # random intercept for participant
          (1 | participant_id),
        family = "binomial",
        data = df.comparison_model)

# Compare model A with model B
# NOTE(review): these two models are non-nested and have the same number of
# parameters, so the likelihood-ratio chi-square in the output below is not
# meaningful (printed Chisq/Df are 0). The informative comparison here is
# AIC/BIC, which favor the misleading_item model.
anova(model.comparison.misleading, model.comparison.length)
## Data: df.comparison_model
## Models:
## model.comparison.misleading: correct_response ~ 1 + misleading_item + (1 | participant_id)
## model.comparison.length: correct_response ~ 1 + question_length + (1 | participant_id)
##                             npar    AIC    BIC  logLik deviance Chisq
## model.comparison.misleading    3 5195.3 5214.2 -2594.6   5189.3      
## model.comparison.length        3 5568.8 5587.7 -2781.4   5562.8     0
##                             Df Pr(>Chisq)
## model.comparison.misleading              
## model.comparison.length      0

3.4 RQ3: How informative and discriminative are the items in relation to data visualization literacy?

3.4.1 Item Discrimination and Item Information

# create new df
# (working copy of the raw trial-level data for the IRT analyses below)
df.irt =
  df.raw

Next, we evaluate item information and item discrimination for all test items across all tests from an IRT perspective.

In IRT, item difficulty is defined as the level of latent ability (i.e., data visualization) or point on the ability scale (represented by theta) needed to have a 50% probability of getting the item right. Harder items require more ability to have a 50% probability of getting the item right. Easier items require less ability to have a 50% probability of getting the item right.

We fit two-parameter logistic (2PL) models, where the expected difference in performance (i.e., probability of getting the item right) depends not just on ability level but also the specific item in question. In other words, subjects’ responses are predicted by the (hypothesized) ability as well as item difficulty.

Item information represents each item’s ability to differentiate between test-takers based on ability level (theta). We plot item information using Item Characteristic Curves (ICCs), where the x-axis represents ability level (theta), and the y-axis represents probability of getting a correct response. Higher item information is better.

Item discrimination represents how strongly related a given item is to the latent ability in question as assessed by the broader test. Items with higher discrimination are better at differentiating test-takers. More formally, it is the slope of a tangent at the point of inflection of the item response function (IRF), or the point on the logistic curve at which the curvature changes signs.

3.4.1.1 WAN

In the plots below, we see that for many items, respondents very low on data visualization literacy still have a decent probability of choosing the correct answer. Items differ quite widely in their discrimination.

# Run for WAN
# Reshape trial-level data into a wide person x item matrix
# (rows = participants, cols = items, cells = 0/1), as ltm expects.
df.irt_wide = 
  df.irt %>%
  # Filter for WAN items
  filter(test_name == "WAN") %>%
  dplyr::select(participant_id, item_id, correct_response) %>%
  pivot_wider(names_from = item_id,
              values_from = correct_response) %>%
  dplyr::select(-c(participant_id))

# run 2pl model
model.2pl =
  # predict by difficulty + ability
  ltm(df.irt_wide ~ z1,
      # print IRT parameters (coefficient estimates of item difficulty and discrimination)
      IRT.param = TRUE,
      # 1000 iterations to yield estimates
      control = list(iter.em = 1000))

# item characteristic curves
# (wrapped with as.ggplot so this base-graphics plot can later be combined
# with patchwork into the all-tests figure)
icc_wan =
  as.ggplot(~plot(model.2pl,
     type = "ICC", zrange = c(-4, 4),
     main = "WAN"))

icc_wan

# item information plot
plot(model.2pl,
     type = "IIC", zrange = c(-4, 4))

# Wright map
# (refit with TAM to get person and item distributions on a common scale)
fit_2pl_tam =
  tam.mml.2pl(df.irt_wide)

IRT.WrightMap(fit_2pl_tam)

3.4.1.2 GGR

In the plots below, we see that some items follow a step function. For some items, item difficulty is high (even with high levels of ability, respondents have a low estimated probability of selecting the correct answer). Again, items differ quite widely in their discrimination.

# Run for GGR
# (same pipeline as for WAN: wide 0/1 matrix -> ltm 2PL -> ICC/IIC/Wright map)
df.irt_wide = 
  df.irt %>%
  # Filter for GGR items
  filter(test_name == "GGR") %>%
  dplyr::select(participant_id, item_id, correct_response) %>%
  pivot_wider(names_from = item_id,
              values_from = correct_response) %>%
  dplyr::select(-c(participant_id))

# run 2pl model
model.2pl =
  # predict by difficulty + ability
  ltm(df.irt_wide ~ z1,
      # print IRT parameters (coefficient estimates of item difficulty and discrimination)
      IRT.param = TRUE,
      # 1000 iterations to yield estimates
      control = list(iter.em = 1000))

# item characteristic curves (ggplot-wrapped for later patchwork assembly)
icc_ggr =
  as.ggplot(~plot(model.2pl,
     type = "ICC", zrange = c(-4, 4),
     main = "GGR"))

icc_ggr

# item information plot
plot(model.2pl,
     type = "IIC", zrange = c(-4, 4))

# Wright map
fit_2pl_tam =
  tam.mml.2pl(df.irt_wide)

IRT.WrightMap(fit_2pl_tam)

3.4.1.3 BRBF

The algorithm did not converge.

# Run for BRBF
# (same pipeline as for WAN: wide 0/1 matrix -> ltm 2PL -> plots)
df.irt_wide = 
  df.irt %>%
  # filter for BRBF items
  filter(test_name == "BRBF") %>%
  dplyr::select(participant_id, item_id, correct_response) %>%
  pivot_wider(names_from = item_id,
              values_from = correct_response) %>%
  dplyr::select(-c(participant_id))

# run 2pl model
# NOTE(review): per the narrative above, the EM algorithm did not converge
# for BRBF -- interpret these estimates and the plots below with caution.
model.2pl =
  # predict by difficulty + ability
  ltm(df.irt_wide ~ z1,
      # print IRT parameters (coefficient estimates of item difficulty and discrimination)
      IRT.param = TRUE,
      # 1000 iterations to yield estimates
      control = list(iter.em = 1000))

# item characteristic curves (ggplot-wrapped for later patchwork assembly)
icc_brbf =
  as.ggplot(~plot(model.2pl,
     type = "ICC", zrange = c(-4, 4),
     main = "BRBF"))

icc_brbf

# item information plot
# (trailing comma removed: it passed an empty argument into the plot call)
plot(model.2pl,
     type = "IIC", zrange = c(-4, 4))

# Wright map
# (refit with TAM to get the person-item map)
fit_2pl_tam =
  tam.mml.2pl(df.irt_wide)

IRT.WrightMap(fit_2pl_tam)

3.4.1.4 VLAT

In the plots below, we see that for many items, respondents low on data visualization literacy still have a decent probability of choosing the correct answer. Items differ quite widely in their discrimination. For some items, item difficulty is high (even with high levels of ability, respondents have a low estimated probability of selecting the correct answer).

# Run for VLAT
# (same pipeline as for WAN: wide 0/1 matrix -> ltm 2PL -> ICC/IIC/Wright map)
df.irt_wide = 
  df.irt %>%
  # filter for VLAT items
  filter(test_name == "VLAT") %>%
  dplyr::select(participant_id, item_id, correct_response) %>%
  pivot_wider(names_from = item_id,
              values_from = correct_response) %>%
  dplyr::select(-c(participant_id))

# run 2pl model
model.2pl =
  # predict by difficulty + ability
  ltm(df.irt_wide ~ z1,
      # print IRT parameters (coefficient estimates of item difficulty and discrimination)
      IRT.param = TRUE,
      # 1000 iterations to yield estimates
      control = list(iter.em = 1000))

# item characteristic curves (ggplot-wrapped for later patchwork assembly)
icc_vlat =
  as.ggplot(~plot(model.2pl,
     type = "ICC", zrange = c(-4, 4),
     main = "VLAT"))

icc_vlat

# item information plot
plot(model.2pl,
     type = "IIC", zrange = c(-4, 4))

# Wright map
fit_2pl_tam =
  tam.mml.2pl(df.irt_wide)

IRT.WrightMap(fit_2pl_tam)

3.4.1.5 CALVI

In the plots below, we see that for many items, ICCs seem uninterpretable; as ability level increases, respondents have a lower probability of selecting the correct answer.

# Run for CALVI
# (same pipeline as for WAN: wide 0/1 matrix -> ltm 2PL -> ICC/IIC/Wright map)
df.irt_wide = 
  df.irt %>%
  # filter for CALVI
  filter(test_name == "CALVI") %>%
  dplyr::select(participant_id, item_id, correct_response) %>%
  pivot_wider(names_from = item_id,
              values_from = correct_response) %>%
  dplyr::select(-c(participant_id))

# run 2pl model
model.2pl =
  # predict by difficulty + ability
  ltm(df.irt_wide ~ z1,
      # print IRT parameters (coefficient estimates of item difficulty and discrimination)
      IRT.param = TRUE,
      # 1000 iterations to yield estimates
      control = list(iter.em = 1000))

# item characteristic curves (ggplot-wrapped for later patchwork assembly)
icc_calvi =
  as.ggplot(~plot(model.2pl,
     type = "ICC", zrange = c(-4, 4),
     main = "CALVI"))

icc_calvi

# item information plot
plot(model.2pl,
     type = "IIC", zrange = c(-4, 4))

# Wright map
fit_2pl_tam =
  tam.mml.2pl(df.irt_wide)

IRT.WrightMap(fit_2pl_tam)

# plot all iccs together
# Collect the per-test ICC panels in one list...
icc_plots =
  list(
  icc_wan,
  icc_ggr,
  icc_brbf,
  icc_vlat,
  icc_calvi
)

# ...and assemble them from that list. (Previously icc_plots was built but
# never used; the panels were re-listed by hand with `+`, duplicating it.)
icc_plot =
  wrap_plots(icc_plots, ncol = 5) +
  plot_annotation(
    title = "ICCs by Test",
    theme = theme(plot.title = element_text(hjust = 0.5))
  )

print(icc_plot)

# export plot
# (write into the shared figures directory, consistent with the other
# ggsave() calls in this document; previously this saved into the working
# directory)
ggsave(paste0(figures_dir, "icc_all_tests.png"),
       icc_plot,
       width = 18, height = 6)

3.5 RQ4: How well does a model simulating a low-literacy, guessing participant perform?

The item discrimination and item information analyses suggest that even students with relatively low data visualization literacy have a non-negligible chance of getting the correct response on various items.

Next, we run a model representing a student with low data visualization literacy who “guesses” the answer to a given item in a semi-informed manner (e.g., using an exclusion strategy or similar).

3.5.1 Data Wrangling

# define dataset
df.irt_wide = 
  df.irt %>%
  # keep only items and response values
  dplyr::select(participant_id, item_id, correct_response) %>%
  # create matrix containing only 1s and 0s for correct/incorrect responses
  # (this time spanning items from ALL tests, not a single test)
  pivot_wider(names_from = item_id,
              values_from = correct_response) %>%
  # remove participant id
  dplyr::select(-c(participant_id))
# create df with column representing binary vs MC nature of each item
df.binaryvsmc =
  df.irt %>% 
  dplyr::select(item_id, possible_responses, correct_response) %>% 
  distinct() %>%
  # count the response options for each item
  # (assumes possible_responses is a comma-separated string and that the
  # options themselves contain no commas -- TODO confirm against raw data)
  mutate(n_options = str_count(possible_responses, ",") + 1,
         item_type = if_else(n_options == 2, "binary", "mc"))

# obtain the item-id vectors for binary and multiple-choice items
# ids of binary (two-option) items
binary_items =
  with(df.binaryvsmc, item_id[item_type == "binary"])

# ids of multiple-choice (>2 option) items
mc_items =
  with(df.binaryvsmc, item_id[item_type == "mc"])

3.5.2 Modeling

3.5.2.1 Binary Items: IRT-based model

For binary items, we use a 2PL model where we set the latent ability (or theta) to ~1 SD below the average to simulate a low-literacy test-taker.

For any item whose difficulty is around –1, the 2-PL yields about a coin-flip chance of obtaining the correct response.

\[P(\text{correct}\mid\theta=-1) \; =\; \text{logit}^{-1}\!\bigl(a(\theta-b)\bigr) \approx 0.50.\]

For easier items (difficulty < –1) the model lets the learner do better than chance, and for harder items (b > –1) it lets them do worse.

# fit 2pl model to generate predictions for binary items
# NOTE(review): df.irt_wide here contains items from all five tests, so a
# single latent trait is assumed across tests.
model.2pl =
  # predict by difficulty + ability
  ltm(df.irt_wide ~ z1,
      # print IRT parameters (coefficient estimates of item difficulty and discrimination)
      IRT.param = TRUE,
      # 1000 iterations to yield estimates
      control = list(iter.em = 1000))

# pass coefficients from model as parameters
# (matrix with one row per item; columns include Dffclt and Dscrmn)
pars = coef(model.2pl)

# 2PL item response function: a = discrimination, b = difficulty;
# returns P(correct | ability = theta)
logistic =
  function(theta, a, b) {
    plogis(a * (theta - b))
  }

# simulate a low-literacy respondent: ability fixed 1 SD below the mean
theta = -1

# obtain discrimination and difficulty for binary items
# (rows of pars are indexed by item id; logistic() returns the predicted
# probability of a correct response at theta = -1)
P_binary =
  logistic(theta,
           pars[binary_items, "Dscrmn"],
           pars[binary_items, "Dffclt"])

# generate predictions for each item
# (names(P_binary) carries the item ids over from the pars rownames)
pred_binary =
  tibble(
    item_id = names(P_binary),
    proportion_correct = round(100 * P_binary, 2)
    )

3.5.2.2 MC Items: Exclusion-strategy guesser

For multiple-choice items, we assume that the student follows an exclusion strategy: They can rule out a non-negligible number of options at a coin-flip rate, and then guess among the remaining options.

# predictions for MC items

# define parameters
# NOTE(review): `theta` is reused here with a different meaning than the IRT
# ability theta above (here it is an elimination probability, and it clobbers
# the theta = -1 set earlier); a distinct name such as p_elim would be safer.
theta = .5 # chance the simulated student eliminates a given response option
elim_prop  = .75 # student eliminates max 75 % of wrong options
n_sim = 426 # we simulate N = 426 respondents to match our data

# define function to get prediction for each MC item
# Simulates an "exclusion strategy" guesser: each of n_sim simulated students
# independently eliminates wrong options with probability theta (capped at
# floor(elim_prop * k_wrong) eliminations) and then guesses uniformly among
# the remaining options.
#
# opts_string: comma-separated string of response options for one item
# n_sim:       number of simulated respondents
# theta:       per-option elimination probability
# elim_prop:   maximum proportion of wrong options that can be eliminated
# returns:     expected probability (0-1) of a correct response
get_mc_pred =
  function(opts_string, n_sim, theta, elim_prop) {
    # count options in base R as commas + 1 (removes the stringr dependency
    # from this otherwise pure helper; assumes options contain no commas)
    n_options = nchar(opts_string) -
      nchar(gsub(",", "", opts_string, fixed = TRUE)) + 1
    # wrong options = all options minus the single correct one
    k_wrong = n_options - 1
    # degenerate single-option item: always correct
    if (k_wrong == 0) return(1)
    # cap on how many wrong options can be ruled out
    max_elim = floor(elim_prop * k_wrong)
    # eliminated options per simulated student, capped at max_elim
    n_elim = pmin(rbinom(n_sim, k_wrong, theta), max_elim)
    # options still in play for each simulated student
    remain_wrong = k_wrong - n_elim
    # expected probability of guessing the correct option
    mean(1 / (remain_wrong + 1))
  }

# generate predictions for each mc item
pred_mc =
  df.binaryvsmc %>% 
  # filter for mc items
  filter(item_type == "mc") %>% 
  distinct(item_id, possible_responses) %>%
  mutate(
    proportion_correct = round(
      100 * map_dbl(
        possible_responses,
        # run prediction function using three parameters
        # (stochastic: results depend on the RNG state; set.seed(1) is
        # called once at the top of the document)
        ~ get_mc_pred(.x,
                      n_sim = n_sim,
                      theta = theta,
                      elim_prop = elim_prop)
        ), 2) # round to two digits
  ) %>% 
  # keep only relevant cols
  dplyr::select(item_id, proportion_correct)
# merge datasets
# (binary + MC predictions stacked into one table, one row per item)
prediction_final =
  bind_rows(
    pred_binary %>%
      mutate(item_id = as.numeric(item_id)),
    pred_mc %>%
      mutate(item_id = as.numeric(item_id))
    ) %>% 
  # as.numeric assumes item ids are numeric codes; non-numeric ids would
  # silently become NA -- TODO confirm
  arrange(item_id)
# create dataset with observed % correct values per item
observed_final =
  df.irt %>%
  group_by(item_id) %>% 
  summarise(actual_prop = 100 * mean(correct_response, na.rm = TRUE),
            test_name   = first(test_name),
            .groups = "drop") %>% 
  mutate(item_id = as.numeric(item_id))

# create new dataset merging observed proportions with predicted proportions
results =
  prediction_final %>% 
  rename(predicted_prop = proportion_correct) %>% 
  left_join(observed_final, by = "item_id")

3.5.2.3 Viz 5: Predicted vs Observed Item Difficulty

# Pearson correlation between predicted and observed % correct
# (re-computed with the full print-out in the Model Statistics section below)
correlation = cor.test(results$predicted_prop, results$actual_prop)

# plot
results %>%
  ggplot(aes(x = predicted_prop,
             y = actual_prop,
             color = test_name)) +
  geom_point(alpha = 0.5, size = 2) +
  # identity line: points on it would be perfectly predicted items
  geom_abline(intercept = 0, slope = 1, linetype = "dashed",
              color = "darkblue", size = 1) +
  scale_color_brewer(palette = "Set1", name = "Test") +
  # correlation coefficient annotated in the bottom-right corner
  annotate("text", x = 100, y = 5,
         label = paste0("r = ", round(correlation$estimate, 3)),
         hjust = 1, vjust = 0,
         size = 7, fontface = "bold") +
  labs(
    title  = "Predicted vs. Observed Item Difficulty (measured as % correct)",
    subtitle = "Individual dots represent individual test items",
    x = "Predicted % correct ('low-literacy guessing student' model)",
    y = "Observed % correct (N = 426 subjects)"
  )

# export plot (ggsave writes the last displayed plot)
ggsave((paste0(figures_dir,
               "guessing_model.png")),
       width = 10, height = 6, device = "png")

3.5.2.4 Model Statistics

The correlation analysis shows that the model predictions are significantly but only weakly correlated with the observed data. The RMSE evaluation shows that the low-literacy guessing model performs worse than a baseline model that fits just the mean. In other words, the guessing model is not a good explanation for the data at hand; it is highly unlikely that high-performing respondents achieved a large number of correct responses by simply doing (informed) guessing.

r(200) = .22, 95% CIs = [.09, .35], p = .002

RMSE_model = 35.17 vs RMSE_baseline = 27.22 (scale: whole percentage points, 0-100)

# Simple correlation
# (Pearson correlation between model-predicted and observed % correct)
correlation = cor.test(results$predicted_prop, results$actual_prop)
print(correlation)
## 
##  Pearson's product-moment correlation
## 
## data:  results$predicted_prop and results$actual_prop
## t = 3.1997, df = 200, p-value = 0.0016
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.08522276 0.34812299
## sample estimates:
##       cor 
## 0.2206775
# Calculate RMSE
# RMSE of fitting the mean
# (baseline: spread of observed item difficulties around their grand mean)
baseline_rmse =
  sqrt(mean((results$actual_prop - mean(results$actual_prop))^2))

print(baseline_rmse)
## [1] 27.21975
# RMSE of model
# (guessing-model predictions vs observed, on the 0-100 % scale)
rmse =
  sqrt(mean((results$predicted_prop - results$actual_prop)^2,
            na.rm = TRUE))

print(rmse)
## [1] 35.17515
cat("Baseline RMSE =", round(baseline_rmse, 1),"%\n",
    "Model   RMSE =", round(rmse, 1),"%\n")
## Baseline RMSE = 27.2 %
##  Model   RMSE = 35.2 %

4 Predicting the Held-Out Data

We have a held-out 20% of data that contains entirely new items. We now have the models above make predictions for that held-out dataset (predicting proportion correct for these new items).

4.1 Data Wrangling

# Import held out data with participant info
df.heldout =
  read.csv(paste0(data_dir, "test_data.csv"))

# add column with binary vs MC information
df.heldout_binaryvsmc =
  df.heldout %>% 
  dplyr::select(item_id, possible_responses) %>% 
  distinct() %>%
  # count the response options for each item
  # (same comma-count assumption as in the training-set wrangling:
  # options themselves must contain no commas -- TODO confirm)
  mutate(n_options = str_count(possible_responses, ",") + 1,
         item_type = if_else(n_options == 2, "binary", "mc"))

# ids of held-out binary (two-option) items
binary_items =
  with(df.heldout_binaryvsmc, item_id[item_type == "binary"])

# ids of held-out multiple-choice (>2 option) items
mc_items =
  with(df.heldout_binaryvsmc, item_id[item_type == "mc"])

4.2 Modeling

4.2.1 Binary Items: IRT-based model

# run predictions for binary items
# ability fixed at 1 SD below the mean, as in the training-set simulation
theta = -1

# NOTE(review): `pars` comes from the 2PL model fit on the *training* items
# (section 3.5.2.1). Indexing it with held-out item ids assumes those ids
# exist as rows of coef(model.2pl) -- verify, since section 4 describes the
# held-out items as entirely new.
P_binary =
  logistic(theta,
           pars[binary_items, "Dscrmn"],
           pars[binary_items, "Dffclt"])

pred_binary_heldout =
  tibble(
    item_id = binary_items,
    predicted_prop = round(100 * P_binary, 2)
)

4.2.2 MC Items: Exclusion-strategy guesser

# run predictions for MC items
# define parameters (same values as the training-set simulation)
theta = .5 # chance the simulated student eliminates a given response option
elim_prop  = .75 # student eliminates max 75 % of wrong options
n_sim = 426 # we simulate N = 426 respondents to match our data
# NOTE(review): 426 matches the *training* sample; the held-out sample size
# may differ -- confirm whether matching it matters for these predictions.

# define function to get prediction for each MC item
# (duplicate of the helper in section 3.5.2.2; kept so this chunk runs
# standalone. Fixes the garbled "dplyr::selecting" comment left by a bad
# search-and-replace, and counts options in base R instead of stringr.)
#
# opts_string: comma-separated string of response options for one item
# n_sim:       number of simulated respondents
# theta:       per-option elimination probability
# elim_prop:   maximum proportion of wrong options that can be eliminated
# returns:     expected probability (0-1) of a correct response
get_mc_pred =
  function(opts_string, n_sim, theta, elim_prop) {
    # count options as commas + 1 (assumes options contain no commas)
    n_options = nchar(opts_string) -
      nchar(gsub(",", "", opts_string, fixed = TRUE)) + 1
    # wrong options = all options minus the single correct one
    k_wrong = n_options - 1
    # degenerate single-option item: always correct
    if (k_wrong == 0) return(1)
    # cap on how many wrong options can be ruled out
    max_elim = floor(elim_prop * k_wrong)
    # eliminated options per simulated student, capped at max_elim
    n_elim = pmin(rbinom(n_sim, k_wrong, theta), max_elim)
    # options still in play for each simulated student
    remain_wrong = k_wrong - n_elim
    # expected probability of selecting the correct response
    mean(1 / (remain_wrong + 1))
  }

# generate predictions for mc items
pred_mc_heldout =
  df.heldout_binaryvsmc %>% 
  filter(item_type == "mc") %>% 
  distinct(item_id, possible_responses) %>%
  mutate(
    predicted_prop = round(
      100 * map_dbl(
        possible_responses,
        # run prediction function using three parameters
        ~ get_mc_pred(.x,
                      n_sim = n_sim,
                      theta = theta,
                      elim_prop = elim_prop)
        ), 2) # round to two digits
  ) %>% 
  # keep only relevant cols
  dplyr::select(item_id, predicted_prop)
# merge datasets
# (stack binary + MC predictions; rescale 0-100 percentages back to 0-1
# proportions for the exported file)
export =
  bind_rows(
    pred_binary_heldout,
    pred_mc_heldout) %>% 
  mutate(predicted_prop = predicted_prop / 100) %>%
  rename(proportion_correct = predicted_prop) %>%
  arrange(item_id)

# export predictions
readr::write_csv(export, paste0(data_dir, "predictions.csv"))

4.2.3 Viz 6: Predicted vs Observed Item Difficulty

# Merge observed held-out difficulties with the exported predictions.
results_heldout =
  df.heldout %>%
  dplyr::select(item_id, proportion_correct, test_name) %>%
  rename(actual_prop = proportion_correct) %>%
  # join explicitly on the item key (previously an implicit natural join,
  # which silently picks shared columns and is fragile to renames)
  left_join(export, by = "item_id") %>%
  # `proportion_correct` now comes from `export` (the model predictions)
  rename(predicted_prop = proportion_correct)

# plot
# (both variables are 0-1 proportions in results_heldout; multiplied by 100
# for display on the percentage scale)
results_heldout %>%
  ggplot(aes(x = (predicted_prop*100),
             y = (actual_prop*100),
             color = test_name)) +
  geom_point(alpha = 0.5, size = 2) +
  # identity line: perfect prediction
  geom_abline(intercept = 0, slope = 1, linetype = "dashed",
              color = "darkblue", size = 1) +
  scale_color_brewer(palette = "Set1", name = "Test") +
  ylim(0, 100) +
  xlim(0, 100) +
  labs(
    title  = "HELD-OUT DATA: Predicted vs. Observed Item Difficulty (measured as % correct)",
    subtitle = "Individual dots represent individual test items",
    x = "Predicted % correct ('low-literacy guessing student' model)",
    y = "Observed % correct (N = 426 subjects)"
  )

# export plot (ggsave writes the last displayed plot)
ggsave((paste0(figures_dir,
               "guessing_model_heldout.png")),
       width = 12, height = 6, device = "png")